import pickle
import plotly.express as px
import plotly.io as pio
# Shared Plotly template: the same light-grey colour for plot and paper
# backgrounds, a single dark-blue trace colour, x tick labels angled at 45,
# and automatic margins on both axes.
custom_template = {
    'layout': {
        'plot_bgcolor': '#EDEDED',
        'paper_bgcolor': '#EDEDED',
        'colorway': ['#203864'],
        'xaxis': {'tickangle': 45, 'automargin': True},
        'yaxis': {'automargin': True},
    },
}
# Register the template under a name, then select it as the default template.
pio.templates['custom_template'] = custom_template
pio.templates.default = 'custom_template'
# Load the results dictionary that the charts in this file read by key.
# NOTE: unpickling can execute arbitrary code; only load an 'outputs.pkl'
# that comes from a trusted source.
with open('outputs.pkl', 'rb') as f:
    outputs = pickle.load(f)
outputs.keys()
dict_keys(['Top 5 Arrival Stations', 'Top 5 Departure Stations', 'Routes', 'Peak Travel Months', 'Peak Travel Weeks', 'Peak Travel Hours', 'peak travel times', 'revenue_by_class', 'revenue_by_type', 'revenue_by_class_and_type', 'revenue_by_class_and_type_crosstab', 'On-Time Performance', 'Reason_for_Delay', 'routemap-delay', 'routemap-delay_crosstab'])
Findings 1: Identify the most popular routes
Birmingham New Street is the top Arrival Station
Manchester Piccadilly is the top Departure Station
Top 10 most popular routes:
# Bar chart of the first ten rows of the 'Routes' table; the y column
# 'Route_Mapping' is displayed as "Passenger Count".
fig = px.bar(
    data_frame=outputs['Routes'].head(10),
    x='Route',
    y='Route_Mapping',
    labels={'Route_Mapping': 'Passenger Count'},
    title="Most Popular Routes - Top 10",
)
fig.show()
Findings 2: Determine peak travel times
The peak travel times are listed below in descending order, with Morning in first position and Early Morning in last (the day is divided into 4 parts):
Morning 34.70%, Afternoon 29.42%, Evening/Night 20.88%, Early Morning 15.00%
March & January are the top travelling months
6 PM & 6 AM are the busiest travelling hours
# Bar chart of the 'peak travel times' table. reset_index() moves the table's
# index into an 'index' column, which is used for x; the 'Travel_Time_Segment'
# column supplies the bar heights.
fig = px.bar(
    data_frame=outputs['peak travel times'].reset_index(),
    x='index',
    y='Travel_Time_Segment',
    title="Peak travel times",
    # The x column holds the former index (presumably the time-segment names -
    # confirm against the data), not a count, so it gets its own axis label
    # instead of repeating 'Passenger Count'.
    labels={'index': 'Travel Time Segment', 'Travel_Time_Segment': 'Passenger Count'},
)
fig.show()
def determine_peak_time(hour: float) -> str:
    """Return the part-of-day label for an hour of the day.

    The day is split into four segments: hours 0-5 are 'Early Morning',
    6-11 'Morning', 12-17 'Afternoon', and everything else 'Evening/Night'.

    Args:
        hour: Hour of the day, expected to lie in the range 0-23.

    Returns:
        One of 'Early Morning', 'Morning', 'Afternoon' or 'Evening/Night'.
        Any value that does not satisfy ``0 <= hour < 18`` (including
        out-of-range input) falls through to 'Evening/Night'.
    """
    if 0 <= hour < 6:
        return 'Early Morning'
    if 6 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 18:
        return 'Afternoon'
    return 'Evening/Night'
# Sort the hourly table by its index and move that index into an 'index'
# column; the result replaces the entry stored in `outputs`.
outputs['Peak Travel Hours'] = outputs['Peak Travel Hours'].sort_index().reset_index()
# Tag every row with the part of the day its 'index' value (the hour) falls in.
outputs['Peak Travel Hours']['Travel_Time_Segments'] = (
    outputs['Peak Travel Hours'].reset_index()['index'].apply(determine_peak_time)
)
color_list = ['royalblue', 'dodgerblue', 'darkblue', 'lightskyblue']
# Horizontal bars: 'index' (labelled Hour) on y, 'Departure_Time' (labelled
# Passenger Count) on x, coloured by day segment. template='plotly' is passed
# explicitly here instead of relying on the default template.
fig = px.bar(
    data_frame=outputs['Peak Travel Hours'].sort_index().reset_index(),
    x='Departure_Time',
    y='index',
    orientation='h',
    color='Travel_Time_Segments',
    color_discrete_sequence=color_list,
    template='plotly',
    title="Travelling Hours",
    labels={'index': 'Hour', 'Departure_Time': 'Passenger Count'},
)
# One tick per unit on the y axis.
fig.update_layout(yaxis={'dtick': 1})
fig.show()
Findings 3: Analyze revenue from different ticket types & classes
Revenue for Standard Class is high compared to First Class, as is the number of passengers who bought tickets. Around 50% of the passengers in both Ticket Classes are Railcard holders, so that won't be a factor behind the low prices in First Class transactions.
The revenue for Advance & Off-Peak bookings is high compared to Anytime. This is interesting because both offer discounted prices, whereas the Anytime category does not. The reason for the high revenue could be the number of passengers in each Ticket Type. Also,
66% of the passengers in the Anytime category hold a Railcard, so this could be one of the factors for the low revenue from the Anytime category.
Combination: Standard Class with Advance type bookings is high compared to the other combinations.
# Pie chart of revenue split by ticket class. The 'sum' column feeds the slice
# sizes; it is labelled 'Revenue' to match the chart title (it was previously
# mislabelled 'Passenger Count').
fig = px.pie(
    outputs['revenue_by_class'],
    values='sum',
    names='Ticket_Class',
    title='Revenue By Class',
    labels={'sum': 'Revenue'},
)
fig.show()
# Same chart, split by ticket type instead of ticket class.
fig = px.pie(
    outputs['revenue_by_type'],
    values='sum',
    names='Ticket_Type',
    title='Revenue By Type',
    labels={'sum': 'Revenue'},
)
fig.show()
color_list = ['royalblue', 'dodgerblue', 'darkblue']
# Bar chart of the 'sum' column (labelled Revenue) per ticket class, with one
# colour per ticket type. template='plotly' is passed explicitly here.
fig = px.bar(
    outputs['revenue_by_class_and_type'],
    x="Ticket_Class",
    y="sum",
    color="Ticket_Type",
    color_discrete_sequence=color_list,
    template='plotly',
    title="Revenue by Class & Type",
    labels={'sum': 'Revenue'},
)
fig.show()
Findings 4: Diagnose on-time performance and contributing factors
Most of the trains are On-Time (86.82%); only 7.24% are delayed, and the rest of the percentage comes from cancelled trains.
Contributing factors for the train delays, in percentages: Weather is the primary factor for the delays
London Euston - Liverpool Lime Street has the highest number of train delays
# List the keys available in `outputs`. This is a bare expression, so the
# result is only displayed when run interactively (e.g. as a notebook cell).
outputs.keys()
dict_keys(['Top 5 Arrival Stations', 'Top 5 Departure Stations', 'Routes', 'Peak Travel Months', 'Peak Travel Weeks', 'Peak Travel Hours', 'peak travel times', 'revenue_by_class', 'revenue_by_type', 'revenue_by_class_and_type', 'revenue_by_class_and_type_crosstab', 'On-Time Performance', 'Reason_for_Delay', 'routemap-delay', 'routemap-delay_crosstab'])
# Pie chart of journey status. reset_index() moves the table's index into an
# 'index' column, used for the slice names; 'Journey_Status' gives the sizes.
fig = px.pie(
    outputs['On-Time Performance'].reset_index(),
    names='index',
    values='Journey_Status',
    title='On-Time Performance',
    labels={'index': 'Journey Status', 'Journey_Status': 'Count(in %)'},
)
fig.show()
Reason for Delay Contributing factors
# Bar chart of delay reasons. The [1:] slice skips the first entry of the
# table before plotting - presumably a non-delay category; confirm against
# the data.
fig = px.bar(
    data_frame=outputs['Reason_for_Delay'][1:].reset_index(),
    x='index',
    y='Reason_for_Delay',
    labels={'index': 'Reason_for_Delay', 'Reason_for_Delay': 'Count'},
    title="Reason for Delay",
)
fig.show()
# Bar chart of the first ten rows of the 'routemap-delay' table:
# 'Route_Mapping' on x, 'Count' on y.
fig = px.bar(
    data_frame=outputs['routemap-delay'].head(10),
    x='Route_Mapping',
    y='Count',
    title="Top 10 Routes with highest delays",
)
fig.show()
# Heatmap of the 'routemap-delay_crosstab' table, drawn with the reversed
# Blues colour scale and an automatic aspect ratio.
fig = px.imshow(
    outputs['routemap-delay_crosstab'],
    color_continuous_scale="Blues_r",
    aspect="auto",
    title='Routemaps with Reason for delays',
)
fig.show()
#Dash is a python framework created by plotly for creating interactive web applications.
# We will use Dash to create a dashboard
import dash
Dash Mantine Components